# Load the main libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sb
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
import re
from wordcloud import WordCloud, STOPWORDS
import nltk
%matplotlib inline
# Silence pandas' chained-assignment warning for this exploratory notebook.
pd.set_option('mode.chained_assignment', None)
# Show up to 500 columns so wide frames display fully.
pd.set_option('display.max_columns', 500)
# Silence remaining warnings. `np.warnings` was removed in NumPy 1.25 —
# use the stdlib `warnings` module directly.
import warnings
warnings.filterwarnings('ignore')
# Load the dataset; its structure is examined during the wrangling steps.
df = pd.read_csv('airlines-extract.csv')
# Download the nltk 'words' corpus.
# NOTE(review): the original comment said "stop words", but the stopwords
# corpus would be nltk.download('stopwords') — confirm which one is needed.
nltk.download('words')
# A bit of cleaning: lowercase the column names and remove spaces so
# columns can be accessed as attributes (df.mentioncreationdate etc.).
df.columns = map(str.lower, df.columns)
df.columns = df.columns.str.replace(' ', '')
# Parse the two date columns into datetimes.
df.mentioncreationdate = pd.to_datetime(df.mentioncreationdate)
df.authoraccountcreationdate = pd.to_datetime(df.authoraccountcreationdate)
# Dummy column of ones to simplify count-based plots.
df['count'] = 1
# Define the function to remove urls from the mention texts
def remove_url(txt):
    """Strip URLs and non-alphanumeric characters from a text string.

    Removes every ``scheme://...`` URL and every character that is not a
    letter, digit, space, or tab, then collapses whitespace runs into
    single spaces.

    Parameters
    ----------
    txt : string
        A text string that you want to parse and remove urls from.

    Returns
    -------
    string
        The same text with urls and punctuation removed.
    """
    # Raw string: the plain-string escapes '\w' and '\/' trigger invalid
    # escape-sequence warnings on Python 3.12+. At the start of a URL the
    # engine still matches the second alternative, because the single-char
    # class cannot match the URL's leading letter.
    return " ".join(re.sub(r"([^0-9A-Za-z \t])|(\w+://\S+)", "", txt).split())
# Normalise the tweet text: lowercase everything, then strip URLs and
# punctuation with remove_url.
df['mentionfulltext'] = df['mentionfulltext'].str.lower().map(remove_url)
# Share of total mentions per airline.
brand_share = px.pie(df, values='count', names='brand')
brand_share.show()
# Spread of sentiments over each brand, to see where Egypt Air stands.
per_brand = df.groupby(['brand', 'sentiment'])['mentionid'].count().unstack()
palette = sb.color_palette("GnBu_d", n_colors=3)
ax = per_brand.plot(kind='barh', stacked=True, color=palette, figsize=(10, 5))
# Park the legend outside the axes so it does not cover the bars.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5), fontsize='large')
ax.set_ylabel('Airline')
ax.set_xlabel('Tweets')
# Drop the neutral mentions and count the remaining sentiments.
polar = df.loc[df['sentiment'] != 'Neutral']
counts = polar.groupby(['sentiment'], as_index=False)['mentionid'].count()
# `hole` turns the pie into a donut chart (Yes, I like donuts more).
donut = go.Figure(data=[go.Pie(labels=counts['sentiment'],
                               values=counts['mentionid'], hole=0.6)])
donut.show()
Positive vs. negative: negative mentions slightly outnumber positive ones.
# Interactions per brand and sentiment (neutral excluded). The per-brand
# donut values are drawn from this aggregate directly, instead of the
# hand-copied constants that would go stale when the data changes.
test = df[df['sentiment'] != 'Neutral'].groupby(['brand', 'sentiment'], as_index=False)['mentioninteractions'].sum()
labels = ['Negative', 'Positive']
# 2x2 grid of donut charts; pie traces require 'domain'-type subplot cells.
specs = [[{'type': 'domain'}, {'type': 'domain'}], [{'type': 'domain'}, {'type': 'domain'}]]
fig = make_subplots(rows=2, cols=2, specs=specs)
# (row, col, display title) for each brand's donut.
layout = {
    'EgyptAir': (1, 1, 'Egypt Air'),
    'Emirates': (1, 2, 'Emirates'),
    'Lufthansa': (2, 1, 'Lufthansa'),
    'Saudi Airlines': (2, 2, 'Saudi Airlines'),
}
for brand, (row, col, title) in layout.items():
    # Negative first, Positive second, matching `labels`.
    values = [test.loc[(test['brand'] == brand) & (test['sentiment'] == s),
                       'mentioninteractions'].sum() for s in labels]
    fig.add_trace(go.Pie(labels=labels, values=values, title=title,
                         title_position='bottom center', hole=0.6), row, col)
# Show label+percent on hover only; no text inside the slices.
fig.update_traces(hoverinfo='label+percent', textinfo='none')
fig.show()
A deeper look does not bode well for Egypt Air: only 9.4% of tweet mentions indicate that customers had positive feedback.
# Per-brand engagement metrics (neutrals excluded from sentiment), computed
# by one helper instead of four copy-pasted sections.
def brand_metrics(frame, brand):
    """Return [net sentiment %, reach %, mention count] for one airline.

    Net sentiment is positive% minus negative% of the non-neutral
    mentions; reach is the share of unique authors among all of the
    brand's mentions.
    """
    sub = frame[frame['brand'] == brand]
    polar = sub[sub['sentiment'] != 'Neutral']
    total = len(polar)
    pos_pct = (len(polar[polar['sentiment'] == 'Positive']) / total) * 100
    neg_pct = (len(polar[polar['sentiment'] == 'Negative']) / total) * 100
    reach = (sub.authorid.nunique() / sub.mentionid.count()) * 100
    return [pos_pct - neg_pct, reach, len(sub)]

# Row keys mirror the brand names with spaces removed
# ('Saudi Airlines' -> 'SaudiAirlines'), as in the original table.
data = {name.replace(' ', ''): brand_metrics(df, name)
        for name in ['EgyptAir', 'Emirates', 'Lufthansa', 'Saudi Airlines']}
pd.DataFrame.from_dict(data, orient='index',
                       columns=['Net Sentiment', 'Reach', 'Mention'])
# Tag each mention with an in-flight and a post-flight topic. Earlier rules
# win, mirroring the precedence of the original nested-where chain; the
# string "NaN" marks "no topic". `pd.np` was removed in pandas 2.0, so use
# numpy's np.select directly.
_INFLIGHT_RULES = [
    ("entertainment|movie|music", "entertainment"),
    ("toilet", "toilet"),
    ("service", "service"),
    ("smell|odor|odour", "smell"),
    ("passenger", "passenger"),
    ("crew|aircrew|cabincrew|attendant", "crew"),
    ("comfort|seat|seating|space", "comfort"),
    ("food|drink|beverage|water|juice|foodie", "food/drink"),
    ("trip", "trip"),
]
_POSTFLIGHT_RULES = [
    ("board|boarding|check|checkin|onboard", "boarding"),
    ("baggage|luggage|lost", "luggage"),
    ("cancel|cancellation|cancelation|overbook|delay", "nuisance"),
    ("lounge|reception", "reception"),
    ("price|cost|charge|money", "price"),
    ("compensation", "compensation"),
    ("book|online", "booking"),
    ("connection|layover|transit", "transit"),
    ("offer|special offer", "offer"),
]

def _tag_topics(text, rules):
    """Label each element of a text Series with its first matching rule."""
    # na=False keeps np.select's conditions strictly boolean even if a
    # non-string cell slipped through the earlier cleaning.
    conditions = [text.str.contains(pattern, na=False) for pattern, _ in rules]
    labels = [label for _, label in rules]
    return np.select(conditions, labels, default="NaN")

df['inflight'] = _tag_topics(df['mentionfulltext'], _INFLIGHT_RULES)
df['postflight'] = _tag_topics(df['mentionfulltext'], _POSTFLIGHT_RULES)
# Egypt Air mentions with a definite (non-neutral) sentiment.
egy_air = df[(df['brand'] == 'EgyptAir') & (df['sentiment'] != 'Neutral')]

# Interactions per in-flight topic, split by sentiment.
inflight_int = (egy_air[egy_air['inflight'] != 'NaN']
                .groupby(['inflight', 'sentiment'], as_index=False)['mentioninteractions']
                .sum())
# Interactions per post-flight topic, split by sentiment.
postflight_int = (egy_air[egy_air['postflight'] != 'NaN']
                  .groupby(['postflight', 'sentiment'], as_index=False)['mentioninteractions']
                  .sum())

px.bar(inflight_int, y='mentioninteractions', x='inflight',
       color='sentiment', barmode='stack').show()
px.bar(postflight_int, y='mentioninteractions', x='postflight',
       color='sentiment', barmode='stack').show()
# Treemaps of Egypt Air interactions by author demographic, sentiment and
# topic — one helper instead of four copy-pasted sections.
def _interaction_treemap(frame, stage_col, group_col):
    """Aggregate interactions and draw a demographic/sentiment/topic treemap.

    stage_col is 'inflight' or 'postflight'; group_col is 'authorgender'
    or 'authorcountry'. Filters out unknown demographics, neutral
    sentiment, untagged mentions, and non-EgyptAir brands.
    """
    agg = frame.groupby([stage_col, 'sentiment', group_col, 'brand'],
                        as_index=False)['mentioninteractions'].sum()
    keep = ((agg[group_col] != 'Unknown') & (agg['sentiment'] != 'Neutral') &
            (agg[stage_col] != 'NaN') & (agg['brand'] == 'EgyptAir'))
    # The gender breakdown additionally drops organization accounts,
    # as in the original per-section filters.
    if group_col == 'authorgender':
        keep &= agg[group_col] != 'organization'
    fig = px.treemap(agg[keep], path=[group_col, 'sentiment', stage_col],
                     values='mentioninteractions')
    fig.show()

# Same figure order as before: gender in/post-flight, then country.
for group in ('authorgender', 'authorcountry'):
    for stage in ('inflight', 'postflight'):
        _interaction_treemap(df, stage, group)
To move forward from this, I would like to learn more about tag clustering and use it to generate word clouds for social media listening. To wrangle data from other resources, I will need to revise my Python installation and revisit the old notebooks.
# Flag tweets whose cleaned text mentions Etihad, then preview them.
has_etihad = df.mentionfulltext.str.contains('etihad|etihadairways|etihad airways')
df['etihadmentions'] = has_etihad
etihad = df[df['etihadmentions'] == True]
etihad.head()
#etihad.to_csv('etihad_mentions.csv')
In case you need to export the data, uncomment the line above (remove the leading `#`) and run the cell.